. Titanic dataset contains 891 rows and 12 columns.
. DataSet is downloaded from kaggle
. Notebook contains the indepth analysis of dataset including Data Cleaning, data wrangling, Data Visualization, Probability, Descriptive and Inferential Statistics.
Importing Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
Loading and viewing the structure of data
titanic <- read.csv("C:/Users/Hp/Desktop/titanic.csv",sep = ",",stringsAsFactors = TRUE,na.strings = NA,header = T)
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
dim(titanic)
## [1] 891 12
summary(titanic)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
Cleaning and changing the schema of data
# Removing the NULL values
titanic<- na.omit(titanic)
dim(titanic)
## [1] 714 12
#We can use sapply function to get the nº of missing values in our dataset
sapply(titanic,function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
# Converting to Factors
titanic$Pclass <- as.factor(titanic$Pclass)
titanic$Survived <- as.factor(titanic$Survived)
titanic$Sex <- as.factor(titanic$Sex)
titanic$Embarked <- as.factor(titanic$Embarked)
titanic$Survived<-ifelse(titanic$Survived==0,"Not Survivied","Survived")
titanic$Embarked<- ifelse(titanic$Embarked=="C","Cherbourg",(ifelse(titanic$Embarked=="Q","Queenstown","vSouthampton")))
Q1) What was the survival rate by gender?
###Using DPLYR
Survival_Count<-titanic %>%
select(Sex,Survived) %>%
group_by(Sex,Survived) %>%
summarise(total = n())
Survival_Count
Sex <fctr> | Survived <chr> | total <int> | ||
|---|---|---|---|---|
| female | Not Survivied | 64 | ||
| female | Survived | 197 | ||
| male | Not Survivied | 360 | ||
| male | Survived | 93 |
### Using GGPLOT2
g2<- ggplot(titanic, aes(x = Sex, fill = Survived)) +
theme_economist() +
geom_bar() +
labs(y = "Passenger Count",
title = "Titanic Survival Rates by Sex") +
scale_y_continuous(limits = c(0,500),breaks = seq(0,500,100))
ggplotly(g2)
Q2) What was the survival rate by class of ticket?
### Using DPLYR
Survival_Count_By_Class <- titanic %>%
select(Pclass,Survived) %>%
group_by(Pclass,Survived) %>%
summarise(Total= n())
Survival_Count_By_Class
Pclass <fctr> | Survived <chr> | Total <int> | ||
|---|---|---|---|---|
| 1 | Not Survivied | 64 | ||
| 1 | Survived | 122 | ||
| 2 | Not Survivied | 90 | ||
| 2 | Survived | 83 | ||
| 3 | Not Survivied | 270 | ||
| 3 | Survived | 85 |
### Using GGPLOT2
g3<-ggplot(titanic, aes(x = Pclass, fill = Survived)) +
theme_solarized() +
geom_bar() +
labs(y = "Passenger Count",
title = "Titanic Survival Rates by Pclass") +
scale_y_continuous(limits = c(0,400))
ggplotly(g3)
Q3) What was the survival rate?
g1<-ggplot(titanic, aes(x = Survived)) +
theme_wsj()+
geom_bar(fill=rainbow(2),col="black") +
labs(y = "Passenger Count",
title = "Titanic Survival Rates")
ggplotly(g1)
Q4) What is the distribution of passenger ages?
### using DPLYR
age_distribution <- titanic %>%
select(Age) %>%
mutate(distribution = cut(Age, breaks = seq(0,80,20)))%>%
group_by(distribution) %>%
summarise(total_count= n())
age_distribution <- na.omit(age_distribution)
age_distribution
distribution <fctr> | total_count <int> | |||
|---|---|---|---|---|
| (0,20] | 179 | |||
| (20,40] | 385 | |||
| (40,60] | 128 | |||
| (60,80] | 22 |
### USing GGPLOT2
g5<-ggplot(titanic, aes(x = Age)) +
theme_stata() +
geom_histogram(binwidth = 5,col="black",fill=rainbow(17))+
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
title = "Titanic Age Distribtion")
ggplotly(g5)
We can show by Density Graph also
g6<-ggplot(titanic, aes(x = Age)) +
theme_dark() +
geom_density(alpha = 0.5) +
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
title = "Titanic Age Distribtion")
ggplotly(g6)
Q5) What was the survival rate by class of ticket and gender?
##Using DPLYR
Survival_Count_By_Sex_And_Class<- titanic %>%
select(Sex,Survived,Pclass) %>%
group_by(Pclass,Sex,Survived) %>%
summarise(Survival_Count = n())
Survival_Count_By_Sex_And_Class
Pclass <fctr> | Sex <fctr> | Survived <chr> | Survival_Count <int> | |
|---|---|---|---|---|
| 1 | female | Not Survivied | 3 | |
| 1 | female | Survived | 82 | |
| 1 | male | Not Survivied | 61 | |
| 1 | male | Survived | 40 | |
| 2 | female | Not Survivied | 6 | |
| 2 | female | Survived | 68 | |
| 2 | male | Not Survivied | 84 | |
| 2 | male | Survived | 15 | |
| 3 | female | Not Survivied | 55 | |
| 3 | female | Survived | 47 |
### Using GGPLOT2
g4<-ggplot(titanic, aes(x = Sex, fill = Survived)) +
theme_base() +
facet_wrap(~ Pclass) +
geom_bar(col="black") +
labs(y = "Passenger Count",
title = "Titanic Survival Rates by Pclass and Sex")
ggplotly(g4)
Q6) What are the survival rates by age?
titanic$Age<- round(titanic$Age)
g7<- ggplot(titanic, aes(x = Age, fill = Survived)) +
theme_economist_white() +
geom_histogram(bins = 30,col="black")+
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
title = "Titanic Survival Rates by Age")
ggplotly(g7)
#Distributions can even be shown using points
g8<-ggplot(age_distribution,aes(distribution,total_count,fill=distribution)) +
theme_base() +
geom_count()+
labs(x = "Age Distribution",
title = "Distribution by Total Counts")
ggplotly(g8)
We can See Box Plot for more details
g9<-ggplot(titanic, aes(x = Survived, y = Age,fill=Survived)) +
theme_excel()+
geom_boxplot() +
labs(y = "Age",
x = "Survived",
title = "Titanic Survival Rates by Age")
ggplotly(g9)
g10<- ggplot(titanic,aes(titanic$Survived,titanic$Age,fill=titanic$Survived)) +
theme_excel_new()+
geom_abline(intercept =median(titanic$Age)) +
geom_violin() +
labs(y = "Age distribution",
title = "Titanic Survival Rates by Age")
ggplotly(g10)
Q34)Distribution based on Gender age fare survival
g36<-ggplot(titanic, aes(x =Fare,y=Age)) +
theme_stata() +
facet_wrap(~Sex) +
geom_point(aes(col=Survived),position = "jitter",pch=8,cex=1.5) +
geom_smooth(method = "lm",se=F)+
geom_hline(yintercept = median(titanic$Age))+
labs(y = "Age",
x = "Fare",
title = "Titanic Age Vs Fare wrt Sex and Survived")
ggplotly(g36)
1)=> By seeing the plot we can see there is a horizontal line representing median of Age.
2)=> We can see the distribution of Fare and relation between Fare and Age using a Regression Line.
3)=> By seeing the regression line we can distinguish that males have paid more than females as the regression line is always above the median for males.
4)=> We can also see females are more survived compared to males.
Q7) What is the survival rates by age when segmented by gender and class of ticket?
g11<- ggplot(titanic, aes(x = Age, fill = Survived)) +
theme_stata() +
facet_wrap(Sex ~ Pclass) +
geom_density(alpha = 0.5) +
labs(y = "Total Count",
x = "Age",
title = "Titanic Survival Rates by Age, Pclass and Sex")
ggplotly(g11)
=> By seeing the graph of survival rate by Age,PClass and Sex, We can conclude that Survival count of female is more than male and specially in the range betwen (20,40) more womens survived but more males died between the same range.
Q8) Top 10 people who paid highest ticket price
top_10_ticket_price_payers <- titanic %>%
select(Name,Fare)%>%
arrange(desc(Fare)) %>%
top_n(10)
## Selecting by Fare
top_10_ticket_price_payers$Fare<-round(top_10_ticket_price_payers$Fare)
top_10_ticket_price_payers
Name <fctr> | Fare <dbl> | |||
|---|---|---|---|---|
| Ward, Miss. Anna | 512 | |||
| Cardeza, Mr. Thomas Drake Martinez | 512 | |||
| Lesurer, Mr. Gustave J | 512 | |||
| Fortune, Mr. Charles Alexander | 263 | |||
| Fortune, Miss. Mabel Helen | 263 | |||
| Fortune, Miss. Alice Elizabeth | 263 | |||
| Fortune, Mr. Mark | 263 | |||
| Ryerson, Miss. Emily Borie | 262 | |||
| Ryerson, Miss. Susan Parker "Suzette" | 262 | |||
| Baxter, Mr. Quigg Edmond | 248 |
Q9)Total people Embarked from each place
g12<-ggplot(titanic,aes(x =Embarked)) +
theme_foundation() +
geom_bar(col="black",aes(fill=Embarked))+
labs(title = "Distribuion of Embarked")
ggplotly(g12)
=> By seeing the graph of Embarked, We can conclude that most of the people boarded titanic from vSouthampton.
Q10) Checking Relationship between Age and Fare
sample_titanic1<- titanic[1:400,]
g13<- ggplot(sample_titanic1,aes(x=Age,y=Fare)) +
theme_dark()+
geom_point() +
geom_smooth(method = lm)+
labs(title = "Distribution of Age and Fare")
ggplotly(g13)
# We can also see the corelation coeffcient to see the relation
cor(sample_titanic1$Age,sample_titanic1$Fare)
## [1] 0.09007522
We can see there is no a huge change b/w age and fare. As the corelation coeffcient is .09 so there is no Relation between Fare and Age
Q11)Distribution between Age and Survived
g14<- ggplot(titanic, aes(x = Age, fill = Survived)) +
theme_economist()+
geom_histogram(binwidth = 5,col="black") +
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
title = "Titanic Survival Rates by Age")
ggplotly(g14)
Q12)Distribution of PClass and fare
fare_category <-cut(titanic$Fare,breaks = c(0,100,250,512),labels = c("Silver Price","Golden Price","Premium Price"))
table(fare_category)
## fare_category
## Silver Price Golden Price Premium Price
## 659 39 6
titanic$fare_category<- fare_category
titanic<- na.omit(titanic)
g15<- ggplot(titanic) +
theme_base()+
geom_bar(aes(x=fare_category,fill=Pclass))+
labs(ylab="Fair Distribution",title="Fare Distribution Vs Pclass")
ggplotly(g15)
1)=> By seeing the graph, We can see the Silver Tickets are brought more than others and PClass 3 are more.
2)=> Other important result we see , as Silver Ticket price is between (0,100) still we see PClass 1 and PClass 2 tickets in this price which tells us their is Fare discrimination based on PClass Tickets as for such a small amount we are getting PClass 1 ticket at some Embarked.
Q13)Distribution of Sex and fare
g16<-ggplot(titanic) +
theme_economist_white()+
geom_bar(aes(x=fare_category,fill=Sex))+
labs(ylab="Fair Distribution", title = "Distribution of Fare by Sex")
ggplotly(g16)
1)=> By seeing the graph, We can see Males have brought more Silver Tickets than Females.
2)=> Females have brought more Golden Tickets than Males.
3)=> Premium tickets are very less brought.
Q14)Top 10 Aged People in Male
Top_10_Aged_Male<- titanic %>%
select(Name,Sex,Age) %>%
filter(Sex=="male") %>%
arrange(desc(Age)) %>%
select(Name,Age) %>%
top_n(10)
## Selecting by Age
Top_10_Aged_Male$Age<- round(Top_10_Aged_Male$Age)
Top_10_Aged_Male
Name <fctr> | Age <dbl> | |||
|---|---|---|---|---|
| Barkworth, Mr. Algernon Henry Wilson | 80 | |||
| Svensson, Mr. Johan | 74 | |||
| Goldschmidt, Mr. George B | 71 | |||
| Artagaveytia, Mr. Ramon | 71 | |||
| Connors, Mr. Patrick | 70 | |||
| Mitchell, Mr. Henry Michael | 70 | |||
| Crosby, Capt. Edward Gifford | 70 | |||
| Wheadon, Mr. Edward H | 66 | |||
| Ostby, Mr. Engelhart Cornelius | 65 | |||
| Duane, Mr. Frank | 65 |
Details of Oldest Male on Titanic
sqldf("select * from titanic where Name='Barkworth, Mr. Algernon Henry Wilson'")
PassengerId <int> | Survived <chr> | Pclass <fctr> | Name <fctr> | Sex <fctr> | Age <dbl> | SibSp <int> | Parch <int> | Ticket <fctr> | Fare <dbl> | |
|---|---|---|---|---|---|---|---|---|---|---|
| 631 | Survived | 1 | Barkworth, Mr. Algernon Henry Wilson | male | 80 | 0 | 0 | 27042 | 30 |
=> We can see from table that Mr Algernon Henry Wilson Barkworth is the oldest male on Ship and he survived.
Q15)Distribution of Aged Males
g17<-ggplot(data = Top_10_Aged_Male,aes(Age)) +
geom_histogram(fill=rainbow(4),col="black",binwidth = 5) +
theme_dark() +
labs(title = "Aged People Age Distribution")
ggplotly(g17)
Q16)Top 10 Aged People in Females
Top_10_Aged_Female<- titanic %>%
select(Name,Sex,Age) %>%
filter(Sex=="female") %>%
arrange(desc(Age)) %>%
select(Name,Age) %>%
top_n(10)
## Selecting by Age
Top_10_Aged_Female$Age<- round(Top_10_Aged_Female$Age)
Top_10_Aged_Female
Name <fctr> | Age <dbl> | |||
|---|---|---|---|---|
| Andrews, Miss. Kornelia Theodosia | 63 | |||
| Turkula, Mrs. (Hedwig) | 63 | |||
| Stone, Mrs. George Nelson (Martha Evelyn) | 62 | |||
| Warren, Mrs. Frank Manley (Anna Sophia Atkinson) | 60 | |||
| Bonnell, Miss. Elizabeth | 58 | |||
| Lurette, Miss. Elise | 58 | |||
| Graham, Mrs. William Thompson (Edith Junkins) | 58 | |||
| Mack, Mrs. (Mary) | 57 | |||
| Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | 56 | |||
| Hewlett, Mrs. (Mary D Kingcome) | 55 |
sqldf("select * from titanic where Name='Andrews, Miss. Kornelia Theodosia' ")
PassengerId <int> | Survived <chr> | Pclass <fctr> | Name <fctr> | Sex <fctr> | Age <dbl> | SibSp <int> | Parch <int> | Ticket <fctr> | |
|---|---|---|---|---|---|---|---|---|---|
| 276 | Survived | 1 | Andrews, Miss. Kornelia Theodosia | female | 63 | 1 | 0 | 13502 |
=> We can see from table that Miss. Kornelia Theodosia, Andrews is the oldest female on Ship and she survived.
Q17)Relation between survived and fare wrt Pclass
g18<-ggplot(data = titanic,aes(Survived,log(Fare),fill=Pclass)) +
theme_excel() +
geom_violin() +
labs(y="Fare Distribution",title = "Distribution of Fare and Survival")
ggplotly(g18)
#=> By seeing the graph of Fare and survival, We can conclude that as the PClass class is increase towards 1, Fare is more expensive and they have more chances of survival.
Q18)Relation between Pclass and Embarked wrt Survival
g19<-ggplot(data = titanic,aes(Pclass,fill=Survived)) +
theme_dark()+
facet_wrap(~Embarked) +
geom_bar(col="black") +
labs(title = "Distribution of PClass and Embarked wrt Survival")
ggplotly(g19)
1)=> By seeing the graph, We can conclude that for all Embarked people in PClass 1 survived more compared to other PClass and Embarked.
2)=> We can also see most of people who died in titanic, boarded their ship from vSouthampton and are from PClass 3.
Q19)Relation between Sex and Embarked wrt Survived
g20<- ggplot(data = titanic,aes(Sex,fill=Survived)) +
theme_excel()+
facet_wrap(~Embarked) +
geom_histogram(stat="count",col="black") +
labs(title = "Distribution of Sex and Embarked wrt Survived ")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g20)
=> By seeing the graph, We can conclude that Females survived from all 3 Embarked and males died more from vSouthampton and we can predict that most died Males may be belonged to Pclass 3 as its survival rate is less.
Q20)Relation between Sex and Embarked wrt Pclass
g21<-ggplot(data = titanic,aes(Sex,fill=Pclass)) +
theme_fivethirtyeight()+
facet_wrap(~Embarked) +
geom_bar(col="black") +
labs(title = "Distribution of Sex and Embarked wrt Pclass")
ggplotly(g21)
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
1)=> By seeing the graph, We can conclude that for Cherboung has almost same distribution of Pclass among both Sex categories.
2)=> In Queenstown almost all people in both Sex categories boarded ther ticket in PClass 3.
3)=> In vSouthampton males have slight edge for buying Pclass 1 ticket over females. And more males have buyed Pclass 3 ticket. Our prediction was right in the last question.
Q21)Relation between Age and Embarked wrt Pclass
g22<-ggplot(data = titanic,aes(Pclass,Age,fill=Pclass)) +
theme_dark()+
facet_wrap(~Embarked) +
geom_count() +
ylim(c(0,80))+
ylab("Age Distribution") +
labs(title = "Distribution of Age and Embarked wrt Pclass")
ggplotly(g22)
1)=> By seeing the graph, We can see the different age range for buying different Pclass tickets. The thicker areas somewhere means people of same age are more who brought that Ticket.
2)=> We can also see for vSouthampton, for Pclass 1 age category is fully covered from 1 to 80 i.e from a small child to an oldage all types of people are there.
3)=> We can also see for vSouthampton, for Pclass 3 age category the area is very thicker between age 17 to 50 that concludes this range people have buyed more PClass 3 tickets.
Q22)Relation between PClass Age Sex Embarked
g23<-ggplot(data = titanic,aes(x=Age,fill=Pclass)) +
theme_get()+
facet_wrap(Sex~Embarked) +
geom_bar(stat="count") +
labs(title="Distribution of Age wrt PClass Sex Embarked")
ggplotly(g23)
#=> By seeing the graph, We can conclude the same thing which we got from the last graph.
Q23)Relation between PClass Survived Sex Embarked
g24<-ggplot(data = titanic,aes(Pclass,fill=Sex)) +
theme_igray()+
facet_wrap(Survived~Embarked) +
geom_bar(col="black") +
scale_y_continuous(limits = c(0,80),breaks = seq(0,80,20)) +
labs(title = "Distribution of Pclass wrt Survived Sex Embarked ")
ggplotly(g24)
=> By seeing the graph, We can see the similar conclution we got earlier that males died more than female of PClass 1 and PClass 2. And in Survival also female survived more than males in all PClass tickets.
Q24)Relation between Fare and Embarked
g25<-ggplot(data = titanic,aes(Fare,Embarked)) +
theme_bw() +
geom_jitter(stat = "identity",col="red") +
geom_smooth(method = lm) +
labs(title="Relation between Fare and Embarked")
ggplotly(g25)
1)=> By seeing the plot, the line shows the fare range as per differnt Embarked.
2)=> For Cherboung the variation of Fare is pretty high. Distribution of Fare are more between (0,100) so that must be Pclass 3 ticket. There are some more distribution above 100 upto 500 that shows Pclass 2 ticket will be aroung (100,250) and furture is Pclass 3
3)#=> For Queenstown the distribution of Fare are mostly between (0,100).We can conclude people boarded from there are less and brought PClass 3 tickets more compared to other tickets**
4)#=> For vSouthapton we see too much overplotting between (0,100) so that must be Pclass 3 ticket. There are some more distribution above 100 upto 260 approx.
5)#=>It gives an important conclution that PClass ticket prices vary for differnt Embarked as we have seen Pclass 1 ticket for vSouthampton but max price is only 260 here. And there may be discrimination of Fare based on Gender.
Q25)Ratio of Survival of Sex
titanic2 <- read.csv("C:/Users/Hp/Desktop/titanic.csv",sep = ",",stringsAsFactors = TRUE,na.strings = NA,header = T)
#removing the NULL values
titanic2<- na.omit(titanic2)
dim(titanic2)
## [1] 714 12
#We can use sapply function to get the nº of missing values in our dataset
sapply(titanic2,function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
#converting to factors
titanic2$Pclass <- as.factor(titanic2$Pclass)
titanic2$Sex <- as.factor(titanic2$Sex)
titanic2$Embarked <- as.factor(titanic2$Embarked)
titanic2$Embarked<- ifelse(titanic2$Embarked=="C","Cherbourg",(ifelse(titanic2$Embarked=="Q","Queenstown","vSouthampton")))
survival<- titanic2%>%
select(Sex,Survived)%>%
group_by(Sex)%>%
summarise(survival_rate = mean(Survived))
survival
Sex <fctr> | survival_rate <dbl> | |||
|---|---|---|---|---|
| female | 0.7547893 | |||
| male | 0.2052980 |
We can see that Survival ration of female to male is nearly 3:1
Finding Survival Rate of Females to Males
survival<- titanic2%>%
select(Sex,Survived)%>%
group_by(Sex)%>%
summarise(survival_rate = mean(Survived))
survival
Sex <fctr> | survival_rate <dbl> | |||
|---|---|---|---|---|
| female | 0.7547893 | |||
| male | 0.2052980 |
We can see that Survival ration of female to male is nearly 3:1
Q26)Distribution of Sex and Survival
g26<- ggplot(data = titanic,aes(Sex,fill=Survived)) +
theme_base() +
geom_bar(col="black") +
labs(title="Distribution of Sex and Survival")
ggplotly(g26)
The graphs tells that survival rate of female is more than male
Q27)Distribution of Fare vs Survival
g27<-ggplot(data = titanic,aes(Fare,fill=Survived)) +
theme_economist() +
geom_histogram(bins = 30,col="black") +
scale_y_continuous(limits = c(0,120)) +
labs(title = "Distribution of Fare wrt to Survival")
ggplotly(g27)
The graph tells that, as fare increases Survival increases
Q28)Distribution of survival specific to sex
g28<-ggplot(data = titanic,aes(Fare,fill=Survived)) +
theme_stata() +
facet_wrap(~Sex)+
geom_histogram(bins = 30,col="black") +
scale_y_continuous(limits = c(0,90),breaks =seq(0,90,30)) +
labs(x="Fare",y="Survival Count",title = "Distribution of survival specific to sex")
ggplotly(g28)
As fare increases Survival increases in case of females but not males and expensive fare tickets are more likely brought by men.
Q29)Distribution of survival wrt SibSp
g29<-ggplot(data = titanic,aes(SibSp,fill=Survived)) +
theme_solarized_2() +
geom_bar(col="black")+
scale_y_continuous(limits = c(0,500),breaks =seq(0,500,100)) +
labs(title = "Distribution of SibSp wrt Survived")
ggplotly(g29)
The graph tells that, as the siblings increase the chances of survival decrease
Q30)Distribution of survival wrt Parch
g30<-ggplot(data = titanic,aes(Parch,fill=Survived)) +
theme_solarized_2() +
geom_bar(col="black")+
scale_y_continuous(limits = c(0,200),breaks =seq(0,200,50)) +
labs(title = "Distribution of Parch wrt Survived")
ggplotly(g30)
The graph tells that, as the family increase the chances of survival decrease
Q31)Relationship between family size and survivals
# Adding a calculated field column to dataset
titanic = titanic%>%
mutate(family_size = SibSp + Parch)
g31<-ggplot(data = titanic,aes(family_size,fill=Survived)) +
theme_base() +
geom_bar(col="black")+
labs(title = "Distribution of Family Size wrt Survived",y="Survival Count") +
scale_y_continuous(limits = c(0,450),breaks = seq(0,450,100))
ggplotly(g31)
The graph tells that, as family size increase survival rate decreases.
Q32)Distribution of cabin locations.
A = titanic%>%
select(Cabin) %>%
group_by(Cabin) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
top_n(8)
## Selecting by total
# Filling the missing column value with a variable
#class(A)
A=as.matrix(A)
A[1,1] <- "X"
A=as.data.frame(A)
#class(A)
A
Cabin <fctr> | total <fctr> | |||
|---|---|---|---|---|
| X | 523 | |||
| B96 B98 | 4 | |||
| C23 C25 C27 | 4 | |||
| G6 | 4 | |||
| C22 C26 | 3 | |||
| D | 3 | |||
| F2 | 3 | |||
| F33 | 3 |
g32<-ggplot(data = A,aes(A$Cabin,A$total)) +
theme_dark() +
geom_histogram(stat = "identity",fill=rainbow(8),col="black") +
labs(title = "Distribution of Cabin")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g32)
1)The graph tells that, most of Cabins has the space of 3 to 4 people living.
2)The X cabin represents the missing names of Cabin.
Q32)Top 5 Highest Tickets Sell
top_5_highest_ticket_sell <- sqldf("select Ticket,count(Ticket) as total from titanic2 group by Ticket order by total desc limit 5")
top_5_highest_ticket_sell
Ticket <fctr> | total <int> | |||
|---|---|---|---|---|
| 347082 | 7 | |||
| 3101295 | 6 | |||
| 347088 | 6 | |||
| CA 2144 | 6 | |||
| 382652 | 5 |
#Ploting Graph
g33<-ggplot(data = top_5_highest_ticket_sell ,aes(Ticket,total)) +
theme_economist_white() +
geom_bar(stat="identity",fill=rainbow(5) ,col="black") +
ylim(c(0,8)) +
labs(title = "Top 5 ticket Sold")
ggplotly(g33)
The graph tells that, ticket number 347082 is the highest sold ticket.
Q33)Top 10 Initials Sir Names (Can help to predict survival)
titanic2<-separate(data = titanic2,col =Name,into= c("LastName", "Initial","FirstName","MiddleName"),sep = " ")
## Warning: Expected 4 pieces. Additional pieces discarded in 172 rows [2,
## 4, 8, 9, 14, 15, 17, 23, 30, 32, 33, 34, 36, 39, 40, 66, 77, 93, 95,
## 105, ...].
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 214 rows
## [3, 11, 16, 19, 31, 42, 43, 46, 47, 49, 53, 57, 58, 61, 62, 65, 69, 70, 74,
## 78, ...].
top_10_Initial_Sir_Names<- titanic2%>%
select(Initial,Survived) %>%
group_by(Initial) %>%
summarise(total = n(),survived_total= sum(Survived)) %>%
arrange(desc(total)) %>%
top_n(5)
## Selecting by survived_total
top_10_Initial_Sir_Names
Initial <chr> | total <int> | survived_total <int> | ||
|---|---|---|---|---|
| Mr. | 385 | 66 | ||
| Miss. | 143 | 104 | ||
| Mrs. | 104 | 83 | ||
| Master. | 36 | 21 | ||
| Dr. | 6 | 3 |
#Ploting the Graph
g34<-ggplot(data = top_10_Initial_Sir_Names,aes(Initial,total,fill=Initial)) +
theme_foundation() +
geom_count()+
labs(title = "Distribution of Sir Names")
ggplotly(g34)
=>1)The graph tells that, Sirname ‘Mr’ are more compared to others.
2)Sirname ‘Dr’ there were very less Doctors that that taveled.
3)Sirname with ‘Miss’ is the 2nd most travelled people that shows there were many unmarried people also.
Q33)Total Vs Survival Count
barplot(top_10_Initial_Sir_Names$total,ylim = c(0,450),col=rainbow(5),legend = top_10_Initial_Sir_Names$Initial)
lines(top_10_Initial_Sir_Names$total,type="o",col="blue") +
lines(top_10_Initial_Sir_Names$survived_total,type="o",col="white") +
title("Total Vs Survival Count")
## integer(0)
The bar and line plot shows the total Sir name count wrt their survival count.
Q35) We can find the Relations
##relation between Survival PClass
titanic2$Pclass<-as.numeric(titanic2$Pclass)
cor(titanic2$Survived,titanic2$Pclass)
## [1] -0.3596527
The correlation between Survival and Pclass is moderate negative. That shows as PClass increases From 1 to 2 to 3 , Survival rate decreases.
##relation between Survival Fare
cor(titanic2$Survived,titanic2$Fare)
## [1] 0.2681886
The correlation between Survival and Pclass is moderate positive. That shows as Fare increases, Survival rate also increases.
Q36Number of ‘Child’,‘Adult’,‘Elder’ on the ship
# Creating a range distribuion of Age
age_categories <- cut(titanic$Age,breaks = c(0,20,50,80),labels = c("Children","Adults","OldAged"))
table(age_categories)
## age_categories
## Children Adults OldAged
## 178 461 64
titanic$age_category<- age_categories
titanic<- na.omit(titanic)
nrow(titanic)
## [1] 703
sapply(titanic,function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex
## 0 0 0 0 0
## Age SibSp Parch Ticket Fare
## 0 0 0 0 0
## Cabin Embarked fare_category family_size age_category
## 0 0 0 0 0
g37<-ggplot(titanic,aes(age_category)) +
theme_pander() +
geom_bar(aes(fill=age_category),col="black") +
scale_y_continuous(limits = c(0,500),breaks = seq(0,500,100)) +
labs(title = "Distribution of Age Category")
ggplotly(g37)
=> By seeing the graph of Age category, We can conclude that most of the people are from Adult Category.
Q37Age Categories based on the Embarked wrt to Sex
#Using DPLYR
age_cateory_by_Embarked_Sex<- titanic%>%
select(age_category,Embarked,Sex) %>%
group_by(age_category,Embarked,Sex)%>%
summarise(total = n())
age_cateory_by_Embarked_Sex
age_category <fctr> | Embarked <chr> | Sex <fctr> | total <int> | |
|---|---|---|---|---|
| Children | Cherbourg | female | 21 | |
| Children | Cherbourg | male | 10 | |
| Children | Queenstown | female | 5 | |
| Children | Queenstown | male | 5 | |
| Children | vSouthampton | female | 51 | |
| Children | vSouthampton | male | 86 | |
| Adults | Cherbourg | female | 33 | |
| Adults | Cherbourg | male | 47 | |
| Adults | Queenstown | female | 7 | |
| Adults | Queenstown | male | 8 |
#Using GGPLOT2
g38<- ggplot(titanic,aes(age_category,fill=Sex)) +
theme_dark() +
facet_wrap(~Embarked) +
geom_histogram(stat="count",col="black") +
labs(title = "Age Categories based on the Embarked wrt to Sex")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g38)
=> By seeing the graph, We can conclude that most of the most of male People and specially adults boarded titanic from vSouthampton.
Q38Age Categories based on the Embarked wrt to Pclass
#Using DPLYR
age_cateory_by_Embarked_Pclass<- titanic%>%
select(age_category,Embarked,Pclass) %>%
group_by(age_category,Embarked,Pclass)%>%
summarise(total = n())
age_cateory_by_Embarked_Pclass
age_category <fctr> | Embarked <chr> | Pclass <fctr> | total <int> | |
|---|---|---|---|---|
| Children | Cherbourg | 1 | 7 | |
| Children | Cherbourg | 2 | 4 | |
| Children | Cherbourg | 3 | 20 | |
| Children | Queenstown | 3 | 10 | |
| Children | vSouthampton | 1 | 14 | |
| Children | vSouthampton | 2 | 31 | |
| Children | vSouthampton | 3 | 92 | |
| Adults | Cherbourg | 1 | 49 | |
| Adults | Cherbourg | 2 | 11 | |
| Adults | Cherbourg | 3 | 20 |
#Using GGPLOT2
g39<-ggplot(titanic,aes(Pclass)) +
theme_dark() +
facet_wrap(~Embarked) +
geom_histogram(stat="count",aes(fill=age_category),col="black") +
labs(title = "Age Categories based on the Embarked wrt to Pclass")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g39)
1)=> By seeing the graph, We can see that mostly Adults traveled the titanic having any of the PClass ticket.
2)=> Most of the childrens boarded titanic from vSouthampton and with PClass 3. 3)=> OldAge people are less and have traveled from all the Embarked and mostly from vSouthampton.
Q38Age Categories based on the Embarked wrt to Sex and Pclass
#Using DPLYR
age_cateory_by_Embarked_Sex_Pclass<- titanic%>%
select(age_category,Sex,Embarked,Pclass) %>%
group_by(age_category,Sex,Embarked,Pclass)%>%
summarise(total = n())
age_cateory_by_Embarked_Sex_Pclass
age_category <fctr> | Sex <fctr> | Embarked <chr> | Pclass <fctr> | total <int> |
|---|---|---|---|---|
| Children | female | Cherbourg | 1 | 5 |
| Children | female | Cherbourg | 2 | 3 |
| Children | female | Cherbourg | 3 | 13 |
| Children | female | Queenstown | 3 | 5 |
| Children | female | vSouthampton | 1 | 9 |
| Children | female | vSouthampton | 2 | 13 |
| Children | female | vSouthampton | 3 | 29 |
| Children | male | Cherbourg | 1 | 2 |
| Children | male | Cherbourg | 2 | 1 |
| Children | male | Cherbourg | 3 | 7 |
#Using GGPLOT
g40<-ggplot(titanic,aes(Pclass,fill=age_category)) +
theme_dark() +
facet_wrap(Sex~Embarked) +
geom_bar(col="black") +
labs(title = "Age Categories based on the Embarked wrt to Sex and Pclass")
ggplotly(g40)
1)=> By seeing the graph, We can see that in Cherboung and Queenstown the distribution of Age cateogory among males and females are almost wrt to PClass.
2)=> In vSouthampton Adult Males boarded more than Adult Females wrt to all PClass. And this is same for other age cateogories also as males of all category are more than females.
Q39Age Categories based on the Embarked wrt to Survived
#Using DPLYR
age_cateory_by_Embarked_Survived<- titanic%>%
select(age_category,Embarked,Survived) %>%
group_by(age_category,Embarked,Survived)%>%
summarise(total = n())
age_cateory_by_Embarked_Survived
age_category <fctr> | Embarked <chr> | Survived <chr> | total <int> | |
|---|---|---|---|---|
| Children | Cherbourg | Not Survivied | 9 | |
| Children | Cherbourg | Survived | 22 | |
| Children | Queenstown | Not Survivied | 6 | |
| Children | Queenstown | Survived | 4 | |
| Children | vSouthampton | Not Survivied | 82 | |
| Children | vSouthampton | Survived | 55 | |
| Adults | Cherbourg | Not Survivied | 35 | |
| Adults | Cherbourg | Survived | 45 | |
| Adults | Queenstown | Not Survivied | 11 | |
| Adults | Queenstown | Survived | 4 |
#Using GGPLOT
g41<-ggplot(titanic,aes(Survived,fill=age_category)) +
theme_dark() +
facet_wrap(~Embarked) +
geom_histogram(stat="count",col="black") +
labs(title = "Age Categories based on the Embarked wrt to Survived")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(g41)
1)=> By seeing the graph, We can see that in all Embarked Adults are survived most and next are childrens.
2)=> The result is same in case of death as Adults have died more and the next is Children.
Q40) Age Categories based on the Embarked wrt to Sex Survived and Pclass
#Using DPLYR
age_cateory_by_Embarked_Sex_Survived_Pclass<- titanic%>%
select(age_category,Embarked,Sex,Pclass,Survived) %>%
group_by(age_category,Embarked,Sex,Pclass,Survived)%>%
summarise(total = n())
head(age_cateory_by_Embarked_Sex_Survived_Pclass)
age_category <fctr> | Embarked <chr> | Sex <fctr> | Pclass <fctr> | Survived <chr> | total <int> |
|---|---|---|---|---|---|
| Children | Cherbourg | female | 1 | Survived | 5 |
| Children | Cherbourg | female | 2 | Survived | 3 |
| Children | Cherbourg | female | 3 | Not Survivied | 4 |
| Children | Cherbourg | female | 3 | Survived | 9 |
| Children | Cherbourg | male | 1 | Not Survivied | 1 |
| Children | Cherbourg | male | 1 | Survived | 1 |
#Using GGPLOT
g42<-ggplot(titanic,aes(Pclass,fill=age_category)) +
theme_dark() +
facet_wrap(Sex~Embarked) +
geom_bar(stat="count",col="black") +
labs(title = "Age Categories based on the Pclass wrt Embarked,Sex")
ggplotly(g42)
=> By seeing the graph, We get similar results which we got before.
Q41)Age Categories based on the Pclass wrt Embarked,Survived
#Using DPLYR
age_cateory_by_Embarked_Survived_Pclass<- titanic%>%
select(age_category,Embarked,Pclass,Survived) %>%
group_by(age_category,Embarked,Pclass,Survived)%>%
summarise(total = n())
head(age_cateory_by_Embarked_Survived_Pclass)
age_category <fctr> | Embarked <chr> | Pclass <fctr> | Survived <chr> | total <int> |
|---|---|---|---|---|
| Children | Cherbourg | 1 | Not Survivied | 1 |
| Children | Cherbourg | 1 | Survived | 6 |
| Children | Cherbourg | 2 | Survived | 4 |
| Children | Cherbourg | 3 | Not Survivied | 8 |
| Children | Cherbourg | 3 | Survived | 12 |
| Children | Queenstown | 3 | Not Survivied | 6 |
#Using GGPLOT
g43<-ggplot(titanic,aes(Pclass,fill=age_category)) +
theme_dark() +
facet_wrap(Survived~Embarked) +
geom_bar(stat="count",col="black") +
labs(title = "Age Categories based on the Pclass wrt Embarked,Survived")
ggplotly(g43)
=> By seeing the graph, We can see the similar results as before.
Q42)Age Categories based on the Embarked wrt to Fare
g44<-ggplot(titanic,aes(age_category,Fare)) +
theme_stata() +
facet_wrap(~Embarked) +
geom_violin(color="green",fill="blue")+
labs(title = "Age Categories based on the Embarked wrt to Fare")
ggplotly(g44)
1)=> By seeing the graph, We can see that fares ranges with huge margin for age category wrt differnt Embarked.
2)=> We can conclude that there is discrimination of fares among different Embarked for diffrent age categories specially Adults.
Q43) Age Categories based on the Embarked wrt to Fare and Pclass
g45<-ggplot(titanic,aes(age_category,Fare,fill=Pclass)) +
theme_economist() +
facet_wrap(~Embarked) +
geom_violin() +
labs(title = "Age Categories based on the Embarked wrt to Fare and Pclass")
ggplotly(g45)
=> By seeing the graph, We can see that fares ranges age category wrt PClass and differnt Embarked.
=>PClass 3 rates are somewhat similar there is not a huge discrimination of fare in this. Other PClass have some discrimiantion wrt Embarked and Age Category.
Q44) Age Categories based on the Survived wrt to Fare and Pclass
g46<-ggplot(titanic,aes(age_category,Fare,fill=Pclass)) +
theme_economist() +
facet_wrap(~Survived) +
geom_violin(col="brown") +
scale_y_continuous(limit=c(0,300))+
labs(title = "Age Categories based on the Survived wrt to Fare and Pclass")
ggplotly(g46)
=> By seeing the graph, We can see in Survival we see our previous result that with high PClass i.e 1 the probability of survial increasesa as they pay more compared to others. And it decreases as the PClass goes down as they pay less.